***************************************
***		Do-File to impute wages		***
***		above censoring limit		***
***		Project Vertical			***
***		Last change: 20.12.2017 fo	***
***************************************

* Session setup: start from an empty dataset, switch off -more- paging,
* close any log that is still open, open this file's log, and load the
* linked 2002-2008 data.
clear
set more off
capture log close
log using "$log/vert_03_impute_v6.log", replace
use "$data/linked_2002-2008.dta", clear

*********************************
*** identify censoring limits ***
*********************************

* Candidate values for the censoring limit: keep the wage only where it
* exceeds 80 (the censoring limit is always > 80).
generate highwages = tentgelt if tentgelt > 80

* Within each east-by-year cell the censoring limit (bbmg) is taken to be the
* most frequent of these high wages. To account for rounding errors, later
* code works with a threshold slightly (3 EUR) below it --> make sure this
* also works for East and West Berlin!
* NOTE(review): per Stata's -egen- documentation, mode() without
* minmode/maxmode returns missing when several values tie -- confirm bbmg is
* non-missing in every east-year cell.
sort east year
by east year: egen bbmg = mode(highwages)
drop highwages


******************************
*** impute censored values ***
******************************

* Build the inputs for the interval-regression imputation:
*   cwage       wage capped at the censoring threshold (bbmg - 3)
*   cens        1 = at/above the threshold, 0 = below, . = wage missing
*   fimpwage    robustness variant: censored wages set to 1.5 x threshold
*   lolim/uplim lower/upper interval bounds in logs
*   estsize, propcens, avglnwage   establishment-year level predictors

* Stata treats missing as larger than any number, so an unguarded
* "cwage >= bbmg - 3" is also true for a missing wage. Without the
* !missing() guard a missing wage would be overwritten with the threshold
* and then flagged as censored.
gen cwage = tentgelt
replace cwage = bbmg - 3 if cwage >= bbmg - 3 & !missing(cwage)

* indicator for censored obs (left missing when the wage itself is missing)
gen cens = (cwage >= bbmg - 3) if !missing(cwage)

* for robustness check: impute 1.5 censoring limit
gen fimpwage = .
replace fimpwage = cwage 		if cens==0
replace fimpwage = 1.5*(bbmg-3) if cens==1
gen lnfimpwage = ln(fimpwage)

* prepare for mi impute command: uncensored obs have lolim == uplim,
* censored obs have an open upper bound (uplim missing)
gen lolim = ln(cwage)
gen uplim = ln(cwage)
replace uplim = . if cens==1

* generate firm-level predictors: proportion censored and avg. wage
* estsize = number of observations in the establishment-year cell
bysort estid year: gen estsize = _N
	* _N is never missing, so this line is a safeguard that changes nothing
	replace   estsize = 0 if   estsize==.

* egen mean() ignores missing cens; a cell in which cens is missing for
* every observation gets propcens = 0
bysort estid year: egen propcens = mean(cens)
	replace propcens = 0 if propcens==.

bysort estid year: egen avglnwage = mean(lolim)

gen lnbesch_gesamt02 = ln(besch_gesamt02)

* intermediate save; the imputation loop reloads this file cell by cell
save "$data/linked_imputed_2002-2008.dta", replace

***********************************************************
***		imputation: separately by east/west and year	***
***********************************************************
* For every east (0/1) x year (2002-2008) cell: load that cell from the
* intermediate file, impute the log wage by interval regression, and save
* the cell as its own imp_<east>_<year>.dta file.
set more off
set matsize 2000
forvalues region = 0/1 {
	forvalues yr = 2002/2008 {

		display " "
		display "East:" `region' ", Year:" `yr'
		display " "

		* load only the observations of the current cell
		use "$data/linked_imputed_2002-2008.dta" if east==`region' & year==`yr', clear
		svyset [pw=csweight]	/* this is the right weight */

		* one-observation establishments (estsize == 1): overwrite their
		* establishment-level predictors with the weighted mean taken over
		* the other establishments with fewer than 30 observations
		quietly svy: mean propcens if estsize !=1 & estsize < 30
		quietly replace propcens = _b[propcens] if estsize == 1
		quietly svy: mean avglnwage if estsize !=1 & estsize < 30
		quietly replace avglnwage = _b[avglnwage] if estsize == 1

		mi set wide
		/* -force- is used because a few very small occupations cause perfect
		   collinearity; observations in these occupations get missing values */
		mi impute intreg milnwage foreigner i.workertype age age2 age3 exper_estab exper_estab2 exper_estab3 exper exper2 exper3 ///
			female experxfemale exper2xfemale exper3xfemale ///
			b1.bildung b1.bildung#c.exper b1.bildung#c.exper2 b1.bildung#c.exper3 ///
			besch_gesamt02 lnbesch_gesamt02 propcens avglnwage i.w93_1 i.bula, ll(lolim) ul(uplim) add(1) rseed(37169) force

		* copy the single imputation into milnwage, leave mi mode, and drop
		* the mi bookkeeping variables together with the interval bounds
		replace milnwage = _1_milnwage
		mi unset
		drop milnwage_1_ lolim_1_ uplim_1_ mi_miss lolim uplim
		save "$data/imp_`region'_`yr'.dta", replace
		clear
	}
}


*** Stack the per-cell imputation files into one dataset ***
* Starting from an empty dataset, append each imp_<east>_<year>.dta file and
* delete it from disk right after it has been appended.
set matsize 400
clear
forvalues region = 0/1 {
	forvalues yr = 2002/2008 {
		local cellfile "$data/imp_`region'_`yr'.dta"
		quietly append using "`cellfile'"
		erase "`cellfile'"
	}
}


*** top code at 3x censoring limit ***
* Cap the (log) imputed wage at ln(3 * (bbmg - 3)).
* Stata treats missing as larger than any number, so the comparison alone
* would also be true for observations that the -force-d imputation left
* without a value. They would receive the top-code value and escape the later
* "drop if milnwage == ." step. The !missing() guard keeps them missing.
replace milnwage = ln(3*(bbmg - 3)) if milnwage > ln(3*(bbmg - 3)) & !missing(milnwage)

* save: put the identifiers first, sort by them, shrink storage types, and
* overwrite the intermediate file with the stacked, imputed data
order estid year
sort estid year
compress
save "$data/linked_imputed_2002-2008.dta", replace


*** convert wages into 2010 euros (i.e. into real wages) ***
* Forward slash as path separator, like every other path in this file; it
* works on Windows, macOS and Linux, whereas a backslash works on Windows only.
use "$orig/deu_cpindex_91-13.dta", clear

* one price-index row per year is matched to many person rows
merge 1:m year using "$data/linked_imputed_2002-2008.dta"
	drop if year > 2011 | year < 2002
* NOTE(review): the factor 100/cpindex presumably relies on the index being
* 100 in the base year 2010 -- confirm against the CPI file.
gen mirealwage = exp(milnwage) * (100/cpindex)
replace milnwage = ln(mirealwage)

drop if mirealwage < 20 /* drop persons who make less than EUR 600 per month: too little for a true full time employee */
* rows without a wage (including price-index years that have no person data)
drop if milnwage == .
drop if estid==.
drop _merge cpindex cpindex_change mirealwage

* Proportion censored: unweighted and person-weighted, overall and by college
mean cens
mean cens [pw=persweight]

mean cens if college==1
mean cens [pw=persweight] if college==1

mean cens if college==0
mean cens [pw=persweight] if college==0

* report missing values in the wage and selected other variables
misstable summarize milnwage befristet100 leiharb100 freie100 os_panel , all

sort estid year persnr
save "$data/linked_imputed_2002-2008.dta", replace
* NOTE(review): this deletes the input dataset loaded at the top of this
* do-file, so the file cannot be re-run without recreating that input --
* confirm this is intended.
erase "$data/linked_2002-2008.dta"


*** END ***
log close








































